Python Programming Tutorials

Help! Why does the code always read the last column of my dataset and throw an error?


from collections import Counter
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn import svm, neighbors
from sklearn.ensemble import VotingClassifier, RandomForestClassifier

# Calculate the percentage change in labels to normalize the features

def process_data_for_labels(ticker, hm_days=7):
    """Add forward-looking percentage-change columns for ``ticker``.

    Reads ``sp500_joined_closes.csv`` (first column used as the index) and,
    for every ``i`` in ``1..hm_days``, adds a ``'<ticker>_<i>d'`` column
    holding ``(value i rows ahead - current value) / current value``.

    Args:
        ticker: Name of the CSV column to compute the changes for.
        hm_days: How many forward offsets to create columns for. Defaults
            to 7, the value that was previously hard-coded.

    Returns:
        A ``(tickers, df)`` tuple: the list of column names as read from the
        CSV (taken before the new columns are added) and the DataFrame with
        the new columns appended and missing values replaced by 0.

    Raises:
        KeyError: If ``ticker`` is not a column of the CSV.
    """
    df = pd.read_csv('sp500_joined_closes.csv', index_col=0)
    # Snapshot the original column names now, before the derived columns
    # are appended below.
    tickers = df.columns.values.tolist()
    df.fillna(0, inplace=True)

    # NOTE: a zero in the ticker column (including a filled-in missing value)
    # makes the division produce inf or NaN. NaN is zeroed by the final
    # fillna; inf is left in place for the caller to deal with.
    base = df[ticker]
    for i in range(1, hm_days + 1):
        df['{}_{}d'.format(ticker, i)] = (base.shift(-i) - base) / base

    # shift(-i) leaves the last i rows without a future value; zero them.
    df.fillna(0, inplace=True)
    return tickers, df

# process_data_for_labels('XOM')

def buy_sell_hold(*args):
    """Classify a sequence of percentage changes as buy, sell or hold.

    The arguments are scanned in order. The first one whose magnitude is
    strictly beyond the 2.5% threshold decides the result: ``1`` if it is
    above ``+0.025``, ``-1`` if it is below ``-0.025``. If no argument
    crosses the threshold (or none are given) the result is ``0``.
    """
    threshold = 0.025
    # Lazily yield a signal for each value that breaks the threshold; only
    # the first such signal is ever consumed.
    signals = (
        1 if change > threshold else -1
        for change in args
        if change > threshold or change < -threshold
    )
    return next(signals, 0)

def extract_featuresets(ticker):
    """Build the feature matrix and target labels for ``ticker``.

    Calls ``process_data_for_labels`` to obtain the price table with the
    forward percentage-change columns, labels every row with
    ``buy_sell_hold`` applied to that row's 1- to 7-day forward changes,
    and uses the row-over-row percentage change of every original ticker
    column as the features.

    Args:
        ticker: Name of the ticker column to generate labels for.

    Returns:
        A ``(X, y, df)`` tuple: ``X`` is the array of percentage changes of
        all original ticker columns, ``y`` is the array of target labels
        from the ``'<ticker>_target'`` column, and ``df`` is the DataFrame
        they were taken from.
    """
    # Number of forward-change columns to label from; matches the default
    # number of columns process_data_for_labels creates.
    hm_days = 7
    # Computed once up front so the target column name can never drift from
    # the requested ticker later in the function.
    target_col = '{}_target'.format(ticker)

    tickers, df = process_data_for_labels(ticker)

    # Feed buy_sell_hold one value from each forward-change column per row.
    forward_cols = [df['{}_{}d'.format(ticker, i)]
                    for i in range(1, hm_days + 1)]
    df[target_col] = list(map(buy_sell_hold, *forward_cols))

    print(df.columns)

    # Labels are stringified so the printed Counter keys match the
    # original output format.
    str_vals = [str(v) for v in df[target_col].values.tolist()]
    print('Data spread: {}'.format(Counter(str_vals)))

    df.fillna(0, inplace=True)

    # Turn infinities into NaN so the affected rows can be dropped.
    df = df.replace([np.inf, -np.inf], np.nan)
    df.dropna(inplace=True)

    # Row-over-row percentage change of every original ticker column.
    # Select with the list directly: the previous
    # ``[ticker for ticker in tickers]`` comprehension leaked its loop
    # variable in Python 2 and overwrote the ``ticker`` argument with the
    # last column name, which made the target lookup below raise KeyError.
    df_vals = df[tickers].pct_change()
    df_vals = df_vals.replace([np.inf, -np.inf], 0)
    df_vals.fillna(0, inplace=True)

    X = df_vals.values
    y = df[target_col].values

    return X, y, df

def do_ml(ticker):
    """Train a voting classifier for ``ticker`` and report its accuracy.

    Obtains features and labels from ``extract_featuresets``, holds out 25%
    of the samples for testing, fits a ``VotingClassifier`` combining a
    linear SVC, a k-nearest-neighbours classifier and a random forest, and
    prints the test accuracy and the distribution of predicted labels.

    Args:
        ticker: Name of the ticker column to train on.

    Returns:
        The accuracy score of the fitted classifier on the held-out set.
    """
    X, y, _ = extract_featuresets(ticker)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    # Fixed the misspelled ``RandomForestCLassifier``, which raised NameError.
    clf = VotingClassifier([
        ('lsvc', svm.LinearSVC()),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier()),
    ])

    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)
    print('Accuracy {}'.format(confidence))

    predictions = clf.predict(X_test)
    # The spread shows whether the predictions are skewed towards one label
    # because the model considers it much more likely than the others.
    print('Predicted spread:  {}'.format(Counter(predictions)))

    return confidence

# Script entry point: train and evaluate the classifier for the AAPL ticker.
do_ml('AAPL')

My CSV file is a table with multiple columns, the last of which is the stock ticker BBBY.

And this is the output and error I keep getting when I call do_ml('AAPL'):


Traceback (most recent call last):
  File "finance_12.py", line 68, in <module>
    extract_featuresets('AAPL')
  File "finance_12.py", line 64, in extract_featuresets
    y = df['{}_target'.format(ticker)].values
  File "/Users/harrisleow/anaconda/lib/python2.7/site-packages/pandas/core/frame.py", line 2059, in __getitem__
    return self._getitem_column(key)
  File "/Users/harrisleow/anaconda/lib/python2.7/site-packages/pandas/core/frame.py", line 2066, in _getitem_column
    return self._get_item_cache(key)
  File "/Users/harrisleow/anaconda/lib/python2.7/site-packages/pandas/core/generic.py", line 1386, in _get_item_cache
    values = self._data.get(item)
  File "/Users/harrisleow/anaconda/lib/python2.7/site-packages/pandas/core/internals.py", line 3543, in get
    loc = self.items.get_loc(item)
  File "/Users/harrisleow/anaconda/lib/python2.7/site-packages/pandas/indexes/base.py", line 2136, in get_loc
    return self._engine.get_loc(self._maybe_cast_indexer(key))
  File "pandas/index.pyx", line 132, in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)
  File "pandas/index.pyx", line 154, in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)
  File "pandas/src/hashtable_class_helper.pxi", line 732, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)
  File "pandas/src/hashtable_class_helper.pxi", line 740, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)
KeyError: 'BBBY_target'

You must be logged in to post. Please login or register an account.

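OK, I solved it. We shouldn't name the loop variable `ticker` in `df_vals = df[[ticker for ticker in tickers]].pct_change()`. In Python 2 a list-comprehension variable leaks into the enclosing function scope, so after that line runs, the `ticker` parameter passed to the function has been overwritten with the last ticker name in the CSV file (BBBY in my case). Renaming the loop variable, or simply writing `df[tickers]`, fixes the KeyError.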